Set/check knitR option and working directory
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.3.0 ✔ purrr 0.3.3
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(here)
## here() starts at /home/rguerillot/Documents/Travail/Abdou_project/Staph_infection_project/github_analysis/VANANZ_phenotypes
library(adegenet)
## Loading required package: ade4
## Registered S3 method overwritten by 'spdep':
## method from
## plot.mst ape
##
## /// adegenet 2.1.1 is loaded ////////////
##
## > overview: '?adegenet'
## > tutorials/doc/questions: 'adegenetWeb()'
## > bug reports/feature requests: adegenetIssues()
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(corrplot)
## corrplot 0.84 loaded
library(RColorBrewer)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(cluster)
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(root.dir = here())
print(paste("My working directory is:" ,here()))
## [1] "My working directory is: /home/rguerillot/Documents/Travail/Abdou_project/Staph_infection_project/github_analysis/VANANZ_phenotypes"
Generate parameter index by correcting parameters by JE2 for each experiment
source("Functions/all_functions.R")
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
JE2_GC_median_param_long <- read.csv("Ideas_Grant_2020_analysis/Processed_data/Growth_curves/processed_parameters_GC.csv") %>%
filter(sample_id == "JE2") %>%
select_if(grepl("OD", names(.)) | grepl("sample_id", names(.)) | grepl("experiment", names(.))) %>%
group_by(experiment) %>%
summarise_if(is.numeric, median) %>%
ungroup() %>%
pivot_longer(ends_with("OD")) %>%
rename(JE2_value = value)
JE2_PI_median_param_long <- read.csv("Ideas_Grant_2020_analysis/Processed_data/PI_curves/processed_parameters_PI.csv") %>%
filter(sample_id == "JE2") %>%
select_if(grepl("death", names(.)) | grepl("sample_id", names(.)) | grepl("experiment", names(.))) %>%
group_by(experiment) %>%
summarise_if(is.numeric, median) %>%
ungroup() %>%
pivot_longer(ends_with("death")) %>%
rename(JE2_value = value)
all_GC_param_JE2_corrected <- read.csv("Ideas_Grant_2020_analysis/Processed_data/Growth_curves/processed_parameters_GC.csv") %>%
filter(strain_group != "CONTROL") %>%
select_if(grepl("OD", names(.)) | grepl("sample_id", names(.)) | grepl("experiment", names(.))) %>%
pivot_longer(ends_with("OD")) %>%
merge(., JE2_GC_median_param_long, by = c("experiment", "name"), all.x = T, all.y = F) %>%
filter(!is.na(JE2_value)) %>%
mutate(value = value/JE2_value) %>%
group_by(sample_id, name) %>%
summarise_if(is.numeric, median) %>%
ungroup() %>%
select(-JE2_value) %>%
pivot_wider()
all_PI_param_JE2_corrected <- read.csv("Ideas_Grant_2020_analysis/Processed_data/PI_curves/processed_parameters_PI.csv") %>%
filter(strain_group != "CONTROL") %>%
select_if(grepl("death", names(.)) | grepl("sample_id", names(.)) | grepl("experiment", names(.))) %>%
pivot_longer(ends_with("death")) %>%
merge(., JE2_PI_median_param_long, by = c("experiment", "name"), all.x = T, all.y = F) %>%
filter(!is.na(JE2_value)) %>%
mutate(value = value/JE2_value) %>%
group_by(sample_id, name) %>%
summarise_if(is.numeric, median) %>%
ungroup() %>%
select(-JE2_value) %>%
pivot_wider()
merge corrected data
sample_meta.df <- read.csv("Ideas_Grant_2020_analysis/Raw_data/strain_metadata_corrected_mortality_with_controls.csv")
median_sample_parameters_all.df <- merge(all_GC_param_JE2_corrected, all_PI_param_JE2_corrected, by = "sample_id") %>%
select(-time_of_min_OD, -time_of_min_death, -end_point_timepoint_OD, -end_point_timepoint_death, -min_OD, -min_death, -doubling_time_OD, -doubling_time_death ) %>%
merge(., sample_meta.df, by = "sample_id") %>%
mutate(ST_simp = fct_lump(ST, n = 5)) %>%
filter(!is.na(mortality))
median_param_dat.df <- median_sample_parameters_all.df %>%
select_if(grepl("OD", names(.)) | grepl("death", names(.)))
Check each variable individually
for (var in str_extract(colnames(median_sample_parameters_all.df), ".*OD*") %>% na.omit()) {
t<-ggplot(median_sample_parameters_all.df, aes_string(x = var))+
geom_density(aes(fill = mortality), alpha = .5)
print(t)
}






for (var in str_extract(colnames(median_sample_parameters_all.df), ".*death*") %>% na.omit()) {
t<-ggplot(median_sample_parameters_all.df, aes_string(x = var))+
geom_density(aes(fill = mortality), alpha = .5)
print(t)
}






for (var in str_extract(colnames(median_sample_parameters_all.df), ".*OD*") %>% na.omit()) {
t <- ggviolin(data = median_sample_parameters_all.df,
y = var,
x = "mortality",
fill = "mortality") +
theme_bw() +
theme(legend.position = "none")+
stat_compare_means(ref.group ="Survived",
method = "wilcox.test",
label = "p.signif")
print(t)
}






for (var in str_extract(colnames(median_sample_parameters_all.df), ".*death*") %>% na.omit()) {
t <- ggviolin(data = median_sample_parameters_all.df,
y = var,
x = "mortality",
fill = "mortality") +
theme_bw() +
theme(legend.position = "none")+
stat_compare_means(ref.group ="Survived",
method = "wilcox.test",
label = "p.signif")
print(t)
}






Check vriable correlation with correlogram
M <-cor(median_param_dat.df, method = "pearson")
corrplot(M, type="upper")

M <-cor(median_param_dat.df, method = "kendall")
corrplot(M, type="upper")

M <-cor(median_param_dat.df, method = "spearman")
corrplot(M, type="upper")

Contributions of variables to PC1
fviz_contrib(pca1, choice = "var", axes = 1)

Contributions of variables to PC2
fviz_contrib(pca1, choice = "var", axes = 2)

Contribution of variables to PC1 to 4
fviz_contrib(pca1, choice = "var", axes = 1:4)

Rerun PCA with variable with max contrib.
pca.df <- median_param_dat.df %>%
select(max_death, end_point_OD, AUC_death, end_point_death, max_OD, AUC_OD, max_rate_death) %>%
# select(AUC_death, AUC_OD, max_OD, max_rate_death, max_rate_OD, doubling_time_OD) %>%
filter(!is.na(median_sample_parameters_all.df$mortality))
#row.names(pca.df) <- median_sample_parameters_all.df$sample_id
pca1 <- dudi.pca(df = pca.df, scannf = F, nf = 5, scale = T, center = T)
Plot % variance explained per axes
fviz_screeplot(pca1, addlabels = T)

Plot individual and variable loading with group
# mortality
fviz_pca_biplot(pca1,
#select.ind = list(contrib = 10),
axes = c(1,2),
geom = c("point"),
alpha.ind = .5,
addEllipses = T,
ellipse.level = .75,
col.ind = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
col.circle = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
title = paste("PCA with variable loading" ),
#repel = T,
pointshape = 16,
)

fviz_pca_biplot(pca1,
#select.ind = list(contrib = 10),
axes = c(2,3),
geom = c("point"),
alpha.ind = .5,
addEllipses = T,
ellipse.level = .75,
col.ind = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
col.circle = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality ,
title = paste("PCA with variable loading" ),
#repel = T,
pointshape = 16,
)

fviz_pca_biplot(pca1,
#select.ind = list(contrib = 10),
axes = c(1,2),
geom = c("point"),
alpha.ind = .5,
addEllipses = T,
ellipse.level = .75,
col.ind = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$ST_simp,
col.circle = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$ST_simp,
title = paste("PCA with variable loading" ),
#repel = T,
pointshape = 16,
)

fviz_pca_biplot(pca1,
#select.ind = list(contrib = 10),
axes = c(2,3),
geom = c("point"),
alpha.ind = .5,
addEllipses = T,
ellipse.level = .75,
col.ind = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$ST_simp,
col.circle = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$ST_simp ,
title = paste("PCA with variable loading" ),
#repel = T,
pointshape = 16,
)

Compute Discriminant Analysis of Principal Components (DAPC)
pca.df <- median_param_dat.df %>%
filter(!is.na(median_sample_parameters_all.df$mortality))
row.names(pca.df) <- median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$sample_id
# All vs All
dapc1 <- adegenet::dapc(x = pca.df,
grp = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality == "Died",
n.pca = 6,
n.da = 2,
scale = T,
center = T)
summary(dapc1)
## $n.dim
## [1] 1
##
## $n.pop
## [1] 2
##
## $assign.prop
## [1] 0.8
##
## $assign.per.pop
## FALSE TRUE
## 0.99295775 0.07894737
##
## $prior.grp.size
##
## FALSE TRUE
## 142 38
##
## $post.grp.size
##
## FALSE TRUE
## 176 4
scatter.dapc(dapc1)

dapc1$var.contr
## LD1
## AUC_OD 6.457533e-03
## end_point_OD 1.040002e-01
## max_OD 4.491477e-02
## max_rate_OD 4.053661e-01
## time_of_max_OD 1.198624e-01
## time_of_max_rate_OD 3.476192e-02
## AUC_death 3.201048e-03
## end_point_death 2.978451e-06
## max_death 1.021493e-02
## max_rate_death 6.925194e-02
## time_of_max_death 8.697950e-02
## time_of_max_rate_death 1.149867e-01
loadingplot(dapc1$var.contr)

dapc0 <-data.frame(comparison = "All vs ALL",
var = row.names(dapc1$var.contr),
contrib = as.character(dapc1$var.contr),
row.names = NULL)
reconpute PCA with only best mortality discriminating variable
pca.df <- pca.df %>%
select(time_of_max_OD, max_rate_OD, time_of_max_rate_death )
pca1 <- dudi.pca(df = pca.df, scannf = F, nf = 5, scale = T, center = T)
Plot % variance explained per axes
fviz_screeplot(pca1, addlabels = T)

Plot individual and variable loading with group
# mortality
fviz_pca_biplot(pca1,
#select.ind = list(contrib = 10),
axes = c(1,2),
geom = c("point"),
alpha.ind = .5,
addEllipses = T,
ellipse.level = .95,
col.ind = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
col.circle = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
title = paste("PCA with variable loading" ),
#repel = T,
pointshape = 16,
)

fviz_pca_biplot(pca1,
#select.ind = list(contrib = 10),
axes = c(1,3),
geom = c("point"),
alpha.ind = .5,
addEllipses = T,
ellipse.level = .95,
col.ind = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
col.circle = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
title = paste("PCA with variable loading" ),
#repel = T,
pointshape = 16,
)

Try random forest classifier
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# add sample_id number (ID) to median_sample_parameters_all.df
median_sample_parameters_all.df <- median_sample_parameters_all.df %>%
mutate(ID = row.names(.))
rf.df <- median_sample_parameters_all.df %>%
select_if(grepl("OD", names(.)) | grepl("death", names(.))) %>%
mutate(mortality = median_sample_parameters_all.df$mortality) %>%
mutate(ID = median_sample_parameters_all.df$ID) %>%
mutate(ST_simp = median_sample_parameters_all.df$ST_simp)
# can filter or sample here
rf.df <- rf.df %>%
filter(!is.na(mortality))
rownames(rf.df) <- rf.df$ID
set.seed(100)
# train <- sample(nrow(EM.rf.df), 0.7*nrow(EM.rf.df), replace = FALSE)
# TrainSet <- EM.rf.df[train,]
# ValidSet <- EM.rf.df[-train,]
pred_rf.df <- rf.df %>% select(-mortality, -ID, -ST_simp)
resp_rf.vec <- rf.df$mortality
model1 <- randomForest(x = pred_rf.df, y= resp_rf.vec, data = EM.rf.df, importance = TRUE, ntree = 300)
model1
##
## Call:
## randomForest(x = pred_rf.df, y = resp_rf.vec, ntree = 300, importance = TRUE, data = EM.rf.df)
## Type of random forest: classification
## Number of trees: 300
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 15%
## Confusion matrix:
## Died Survived class.error
## Died 13 25 0.65789474
## Survived 2 140 0.01408451
varImpPlot(model1)

# Predicting on Validation set
predValid <- predict(model1, rf.df, type = "class")
predValid <- as.data.frame(predValid) %>%
mutate(ID = rownames(.)) %>%
rename(Prediction = predValid) %>%
merge(., rf.df %>% select(ID, mortality), by = "ID")%>%
droplevels() %>%
mutate(Accurate = ifelse(Prediction == mortality, T, F))
# Predict class proba
predValid_prob <- predict(model1, rf.df, type="prob")%>%
as.data.frame(.) %>%
mutate(ID = rownames(.)) %>%
merge(., predValid)
# Checking classification accuracy
mean(predValid$Accurate)
## [1] 1
table(predValid$Prediction, predValid$mortality)
##
## Died Survived
## Died 38 0
## Survived 0 142
table(predValid$mortality, predValid$Accurate)
##
## TRUE
## Died 38
## Survived 142
# Keep n best predicted for each strain
best_pred.df <- predValid_prob %>%
filter(Accurate == T) %>%
mutate(Accurate_proba = pmax(Died, Survived)) %>%
group_by(mortality) %>%
#filter(Accurate_proba > 0.9)
top_frac(.5, Accurate_proba)
count(best_pred.df, mortality)
## # A tibble: 2 x 2
## # Groups: mortality [2]
## mortality n
## <fct> <int>
## 1 Died 19
## 2 Survived 72
Re-compute PCA with best accurately classified
pca_meta.df3 <- rf.df %>%
filter(ID %in% best_pred.df$ID)
pca.df3 <- pca_meta.df3 %>%
select(-ID, -mortality, -ST_simp)
pca3 <- dudi.pca(df = pca.df3, scannf = F, nf = 5, scale = T)
Plot PCA
fviz_screeplot(pca3, addlabels = T)

fviz_pca_biplot(pca3,
#select.ind = list(contrib = 10),
axes = c(1,2),
geom = c("point"),
alpha.ind = .5,
addEllipses = T,
ellipse.level = .95,
col.ind = pca_meta.df3$mortality,
col.circle = pca_meta.df3$mortality,
title = paste("PCA - Top 20% of accurately classified strains (Strain prototypes)" ),
repel = T,
pointshape = 16)

fviz_pca_biplot(pca3,
#select.ind = list(contrib = 10),
axes = c(2,3),
geom = c("point"),
alpha.ind = .5,
addEllipses = T,
ellipse.level = .95,
col.ind = pca_meta.df3$mortality,
col.circle = pca_meta.df3$mortality,
title = paste("PCA - Top 20% of accurately classified strains (Strain prototypes)" ),
repel = T,
pointshape = 16)

fviz_pca_biplot(pca3,
#select.ind = list(contrib = 10),
axes = c(1,2),
geom = c("point"),
alpha.ind = .8,
addEllipses = T,
ellipse.level = .95,
col.ind = pca_meta.df3$ST_simp,
col.circle = pca_meta.df3$ST_simp,
title = paste("PCA - Top 20% of accurately classified strains (Strain prototypes)" ),
repel = T,
pointshape = 16)

fviz_pca_biplot(pca3,
#select.ind = list(contrib = 10),
axes = c(2,3),
geom = c("point"),
alpha.ind = .95,
addEllipses = T,
#ellipse.level = .99,
col.ind = pca_meta.df3$ST_simp,
col.circle = pca_meta.df3$ST_simp,
title = paste("PCA - Top 20% of accurately classified strains (Strain prototypes)" ),
repel = T,
pointshape = 16)

Try random forest unsupervised
# library(randomForest)
# add sample_id number (ID) to median_sample_parameters_all.df
median_sample_parameters_all.df <- median_sample_parameters_all.df %>%
mutate(ID = row.names(.))
rf.df <- median_sample_parameters_all.df %>%
select_if(grepl("OD", names(.)) | grepl("death", names(.))) %>%
mutate(mortality = median_sample_parameters_all.df$mortality) %>%
mutate(ID = median_sample_parameters_all.df$ID) %>%
mutate(ST_simp = median_sample_parameters_all.df$ST_simp) %>%
mutate(sample_id = median_sample_parameters_all.df$sample_id)
# can filter or sample here
rf.df <- rf.df %>%
filter(!is.na(mortality))
rownames(rf.df) <- rf.df$ID
set.seed(100)
# train <- sample(nrow(EM.rf.df), 0.7*nrow(EM.rf.df), replace = FALSE)
# TrainSet <- EM.rf.df[train,]
# ValidSet <- EM.rf.df[-train,]
pred_rf.df <- rf.df %>% select(-mortality, -ID, -ST_simp, -sample_id)
resp_rf.vec <- rf.df$mortality
model1 <- randomForest(x = pred_rf.df, data = EM.rf.df, importance = TRUE, ntree = 100)
model1
##
## Call:
## randomForest(x = pred_rf.df, ntree = 100, importance = TRUE, data = EM.rf.df)
## Type of random forest: unsupervised
## Number of trees: 100
## No. of variables tried at each split: 3
varImpPlot(model1)

prox <- model1$proximity
pam.rf <- pam(prox, 4)
pred <- cbind(pam.rf$clustering, as.character(rf.df$sample_id), as.character(rf.df$ST_simp), as.character(rf.df$mortality))
table(pred[,1], pred[,2])
##
## BPH2700 BPH2701 BPH2702 BPH2703 BPH2704 BPH2705 BPH2706 BPH2707
## 1 1 0 0 0 0 0 0 1
## 2 0 1 1 1 0 0 0 0
## 3 0 0 0 0 1 1 1 0
## 4 0 0 0 0 0 0 0 0
##
## BPH2708 BPH2709 BPH2710 BPH2711 BPH2712 BPH2713 BPH2714 BPH2715
## 1 0 0 0 0 0 0 0 0
## 2 0 1 1 1 0 0 0 0
## 3 0 0 0 0 1 0 0 0
## 4 1 0 0 0 0 1 1 1
##
## BPH2716 BPH2717 BPH2718 BPH2719 BPH2720 BPH2721 BPH2722 BPH2723
## 1 1 0 0 1 0 0 0 0
## 2 0 0 0 0 0 0 1 0
## 3 0 1 0 0 0 1 0 0
## 4 0 0 1 0 1 0 0 1
##
## BPH2724 BPH2725 BPH2726 BPH2727 BPH2728 BPH2729 BPH2730 BPH2731
## 1 0 0 0 0 0 0 0 0
## 2 0 0 1 1 0 0 0 1
## 3 0 1 0 0 0 0 0 0
## 4 1 0 0 0 1 1 1 0
##
## BPH2732 BPH2733 BPH2734 BPH2735 BPH2736 BPH2737 BPH2738 BPH2739
## 1 1 1 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 1
## 3 0 0 0 0 0 0 0 0
## 4 0 0 1 1 1 1 1 0
##
## BPH2740 BPH2741 BPH2742 BPH2743 BPH2744 BPH2745 BPH2746 BPH2747
## 1 0 0 0 0 0 0 0 0
## 2 1 0 0 0 0 0 0 1
## 3 0 0 0 0 0 0 0 0
## 4 0 1 1 1 1 1 1 0
##
## BPH2748 BPH2749 BPH2751 BPH2752 BPH2753 BPH2754 BPH2755 BPH2756
## 1 0 1 0 0 1 0 0 1
## 2 1 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0
## 4 0 0 1 1 0 1 1 0
##
## BPH2757 BPH2758 BPH2759 BPH2760 BPH2761 BPH2763 BPH2764 BPH2765
## 1 0 0 0 0 0 0 1 0
## 2 0 1 0 0 0 1 0 1
## 3 1 0 1 1 0 0 0 0
## 4 0 0 0 0 1 0 0 0
##
## BPH2766 BPH2767 BPH2768 BPH2769 BPH2770 BPH2771 BPH2773 BPH2774
## 1 0 0 0 0 0 0 0 0
## 2 1 0 0 1 0 0 0 1
## 3 0 0 1 0 0 0 0 0
## 4 0 1 0 0 1 1 1 0
##
## BPH2775 BPH2776 BPH2777 BPH2778 BPH2779 BPH2780 BPH2781 BPH2782
## 1 0 0 0 0 0 0 0 0
## 2 1 0 0 0 0 1 1 0
## 3 0 0 0 0 0 0 0 0
## 4 0 1 1 1 1 0 0 1
##
## BPH2783 BPH2784 BPH2785 BPH2786 BPH2787 BPH2788 BPH2789 BPH2790
## 1 0 0 0 0 1 0 0 0
## 2 1 0 0 0 0 0 0 0
## 3 0 0 1 1 0 0 0 0
## 4 0 1 0 0 0 1 1 1
##
## BPH2791 BPH2792 BPH2793 BPH2794 BPH2795 BPH2796 BPH2797 BPH2798
## 1 0 1 0 0 0 0 1 0
## 2 0 0 0 0 0 0 0 0
## 3 0 0 0 1 1 1 0 0
## 4 1 0 1 0 0 0 0 1
##
## BPH2799 BPH2800 BPH2801 BPH2802 BPH2803 BPH2804 BPH2805 BPH2806
## 1 0 0 0 0 0 0 1 0
## 2 0 0 0 0 0 1 0 0
## 3 1 0 1 1 1 0 0 1
## 4 0 1 0 0 0 0 0 0
##
## BPH2807 BPH2808 BPH2809 BPH2810 BPH2811 BPH2840 BPH2841 BPH2842
## 1 0 0 0 0 0 1 1 0
## 2 0 0 0 0 1 0 0 1
## 3 0 0 0 1 0 0 0 0
## 4 1 1 1 0 0 0 0 0
##
## BPH2843 BPH2844 BPH2845 BPH2846 BPH2847 BPH2848 BPH2849 BPH2850
## 1 0 1 1 0 0 1 1 1
## 2 1 0 0 0 1 0 0 0
## 3 0 0 0 0 0 0 0 0
## 4 0 0 0 1 0 0 0 0
##
## BPH2851 BPH2852 BPH2853 BPH2854 BPH2855 BPH2856 BPH2857 BPH2858
## 1 0 0 0 0 0 0 0 0
## 2 1 0 0 1 1 0 0 0
## 3 0 0 0 0 0 0 0 0
## 4 0 1 1 0 0 1 1 1
##
## BPH2859 BPH2860 BPH2861 BPH2862 BPH2863 BPH2864 BPH2865 BPH2866
## 1 1 1 1 1 0 0 1 0
## 2 0 0 0 0 0 0 0 1
## 3 0 0 0 0 0 0 0 0
## 4 0 0 0 0 1 1 0 0
##
## BPH2867 BPH2902 BPH2904 BPH2942 BPH2943 BPH2962 BPH2963 BPH3336
## 1 0 0 0 1 0 0 0 0
## 2 0 0 0 0 0 0 1 0
## 3 0 0 0 0 0 0 0 0
## 4 1 1 1 0 1 1 0 1
##
## BPH3337 BPH3427 BPH3455 BPH3478 BPH3483 BPH3504 BPH3506 BPH3515
## 1 0 0 0 1 0 1 0 0
## 2 1 1 0 0 1 0 0 1
## 3 0 0 0 0 0 0 0 0
## 4 0 0 1 0 0 0 1 0
##
## BPH3521 BPH3525 BPH3533 BPH3535 BPH3541 BPH3549 BPH3560 BPH3563
## 1 1 1 0 0 0 0 0 0
## 2 0 0 0 1 0 0 1 0
## 3 0 0 0 0 0 0 0 0
## 4 0 0 1 0 1 1 0 1
##
## BPH3572 BPH3594 BPH3617 BPH3627 BPH3644 BPH3646 BPH3676 BPH3680
## 1 1 0 1 1 0 0 0 0
## 2 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0
## 4 0 1 0 0 1 1 1 1
##
## BPH3681 BPH3698 BPH3706 BPH3708 BPH3712 BPH3723 BPH3725 BPH3733
## 1 1 0 0 0 0 1 0 0
## 2 0 0 1 1 0 0 0 1
## 3 0 0 0 0 0 0 0 0
## 4 0 1 0 0 1 0 1 0
##
## BPH3734 BPH3747 BPH3752 BPH3757
## 1 0 0 0 0
## 2 1 1 0 1
## 3 0 0 0 0
## 4 0 0 1 0
table(pred[,1], pred[,3])
##
## - 22 239 45 5 Other
## 1 5 0 2 4 4 21
## 2 3 10 12 1 2 18
## 3 1 0 1 3 3 14
## 4 11 9 16 11 6 23
table(pred[,1], pred[,4])
##
## Died Survived
## 1 10 26
## 2 9 37
## 3 1 21
## 4 18 58
cluster.df <-cbind(pam.rf$clustering, as.character(rf.df$sample_id)) %>% as.data.frame()
colnames(cluster.df) <- c("cluster", "sample_id")
pca_meta.df4 <- rf.df %>%
merge(., cluster.df , by = "sample_id")
pca.df4 <- pca_meta.df4 %>%
select(-sample_id, -cluster, -ST_simp, -mortality, -ID)
pca4 <- dudi.pca(df = pca.df4, scannf = F, nf = 5, scale = T)
fviz_screeplot(pca4, addlabels = T)

fviz_pca_biplot(pca4,
#select.ind = list(contrib = 10),
axes = c(1,2),
geom = c("point"),
alpha.ind = .5,
addEllipses = T,
ellipse.level = .95,
col.ind = pca_meta.df4$cluster,
title = paste("" ),
repel = T,
pointshape = 16)

fviz_pca_biplot(pca4,
#select.ind = list(contrib = 10),
axes = c(1,2),
geom = c("point"),
alpha.ind = .5,
addEllipses = T,
ellipse.level = .95,
col.ind = pca_meta.df4$mortality,
title = paste("" ),
repel = T,
pointshape = 16)
